1. Preparation

Set working directory

# change this value if needed
# NOTE(review): the relative paths used further down ("./R/functions.R",
# "./metadata.csv", "txt", "./R/images/...") are resolved against this directory
setwd("/Users/gabays/github/RiseAndFall")
# print the working directory to confirm the change
getwd()
## [1] "/Users/gabays/github/RiseAndFall"

Load packages

# Attach every package used by the analysis, in the same order as before.
# A package that cannot be attached is installed first and then attached.
for (pkg in c("ggplot2", "roll", "purrr", "stylo", "dplyr")) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}

Load external functions

source("./R/functions.R")

Load previously computed data

#load("./R/RiseAndFall.RData")

2. Set up

2.1 As 3-grams

Get Metadata and corpus as 3-grams

# Metadata: the first CSV column becomes the row names, and the rows are then
# put in alphabetical order of those names
metadata <- read.csv("./metadata.csv", header = TRUE, sep = ",",
                     row.names = 1, stringsAsFactors = TRUE)
metadata <- metadata[order(rownames(metadata)), ]
# Feature tables: the n3 file (3-grams) and the n1 file (used below as the
# stopword table), both read the same way
data <- read.csv("./feats_tests_n3_k_5000.csv", header = TRUE, sep = ",",
                 row.names = 1, stringsAsFactors = TRUE)
data_stop <- read.csv("./feats_tests_n1_k_5000.csv", header = TRUE, sep = ",",
                      row.names = 1, stringsAsFactors = TRUE)

We compute the distribution: which 3-grams are relevant?

# Column sums of every feature column (the first two columns are skipped),
# drawn as a line to see where the curve flattens
feature_cols <- seq(3, ncol(data))
dist <- colSums(data[, feature_cols])
plot(dist, type = "l")

We work with the 2000 most frequent 3grams – after that the curve is totally flat

# Number of 3-grams kept for the analysis (the text above announces 2000)
n_features <- 2000
# Features start at column 3, so the first n_features of them end at column
# n_features + 2; the former 3:2003 kept 2001 columns, one more than announced
d <- data[, 3:(n_features + 2)]
# every feature column of the stopword table is kept
d_stop <- data_stop[, 3:length(data_stop)]

We normalise the vectors

# 3-grams: transpose (plays end up in columns), then normalise
d <- normalisations(t(d))
# stopwords: same treatment
d_stop <- normalisations(t(d_stop))

We add the metadata

# Pair every metadata row name with the column name of d at the same position
control <- rbind(rownames(metadata), colnames(d))
# Every mask built from metadata is applied positionally to the columns of d,
# so both lists must name the same plays in the same order. Check ALL pairs
# (ignoring the ".txt" extension of the column names), not only the first and
# last six that are printed.
misaligned <- which(control[1, ] != sub("\\.txt$", "", control[2, ]))
if (length(misaligned) > 0) {
  warning(length(misaligned), " play(s) are not aligned between metadata and d",
          " (first mismatch at position ", misaligned[1], ")", call. = FALSE)
}
# we check visually that we have similar values
head(t(control))
##      [,1]                                  
## [1,] "abeille-argelie"                     
## [2,] "abeille-coriolan"                    
## [3,] "abeille-lyncee"                      
## [4,] "about-risette"                       
## [5,] "adenis-homme-qui-ne-peut-pas-siffler"
## [6,] "aigueberre-avare-amoureux"           
##      [,2]                                      
## [1,] "abeille-argelie.txt"                     
## [2,] "abeille-coriolan.txt"                    
## [3,] "abeille-lyncee.txt"                      
## [4,] "about-risette.txt"                       
## [5,] "adenis-homme-qui-ne-peut-pas-siffler.txt"
## [6,] "aigueberre-avare-amoureux.txt"
tail(t(control))
##         [,1]                                
## [1511,] "voltaire-tanis-zelide"             
## [1512,] "voltaire-zaire"                    
## [1513,] "voltaire-zulime"                   
## [1514,] "vondrebeck-alard-forces-de-l-amour"
## [1515,] "zola-madeleine"                    
## [1516,] "zola-therese-raquin"               
##         [,2]                                    
## [1511,] "voltaire-tanis-zelide.txt"             
## [1512,] "voltaire-zaire.txt"                    
## [1513,] "voltaire-zulime.txt"                   
## [1514,] "vondrebeck-alard-forces-de-l-amour.txt"
## [1515,] "zola-madeleine.txt"                    
## [1516,] "zola-therese-raquin.txt"

2.2 As plain texts (beta)

Alternative: loading plays in plain text (for later)

# Load every play of the "txt" directory as ONE character string per file.
#Get the list of all txt files (the dot is escaped and anchored so that only
#names ending in ".txt" are kept)
TxtFiles <- list.files(path = "txt", pattern = "\\.txt$")
# readLines() returns one element per line. The lines of a play are pasted back
# together so that the corpus holds exactly one entry per file: appending the
# raw lines created one entry per LINE, which breaks the pairing with TxtFiles
# when the row names are set further down.
corpus <- lapply(TxtFiles, function(x) {
  #Get the text in the file
  FullText <- suppressWarnings(readLines(file.path("txt", x)))
  paste(FullText, collapse = "\n")
})
View(corpus)

We transform the loaded texts into minable data

# Turn the loaded texts into a play-by-token frequency table.
#tokenisation: txt.to.words2 is applied to every element of corpus
corpus.tok = lapply(corpus, txt.to.words2)
#Counting frequency of tokens
corpus.tok.list = make.frequency.list(corpus.tok)
#Transform frequency into a table (relative = F: relative frequencies are not requested)
corpus.tok.list.freq=make.table.of.frequencies(corpus.tok, corpus.tok.list, relative = F)
#Name the ROWS after the files
# NOTE(review): this assumes corpus has exactly one element per entry of TxtFiles — confirm
row.names(corpus.tok.list.freq)=TxtFiles
#Save a copy on disk
write.csv(corpus.tok.list.freq, file = "corpus.bench.tok.list.freq.csv",row.names=TRUE)
#Read the copy back as a data frame, the first column being used as row names
corpus.tok.list.freq = as.data.frame(read.csv(file="corpus.bench.tok.list.freq.csv", sep = ",", header = TRUE, row.names=1, quote = '\"'))
#transposition (rows become columns)
corpus.tok.list.freq = t(corpus.tok.list.freq)
#normalisation
corpus.tok.list.freq = normalisations(corpus.tok.list.freq)
#Displaying the result
View(corpus.tok.list.freq)

3 Preparation

3.1 Generic value of features

3.1.1 Generic value of stopwords

We check that stopwords do identify genres

# Distance of every play to the centroid of the tragedies and of the comedies
# dated between 1500 and 1800, computed on stopwords.
in_period <- metadata[, "Date"] < 1800 & metadata[, "Date"] > 1500
distToTragedy <- DistToCentroid(d_stop, centroid = rowMeans(d_stop[, metadata[, "Genre"] == "tragedy" & in_period]))

distToComedy <- DistToCentroid(d_stop, centroid = rowMeans(d_stop[, metadata[, "Genre"] == "comedy" & in_period]))

m <- cbind(distToTragedy, distToComedy)
colnames(m) <- c("DistTragedyCentroid","DistComedyCentroid")

#create transparent color (not used by the plot below; kept for later chunks)
mycol <- t_col("white", perc = 100, name = "transparent")

#fix colors: chosen from the genre NAME, so the mapping no longer depends on
#the number or the order of the factor levels
genre <- as.character(metadata[, "Genre"])
colors <- ifelse(genre == "comedy", "blue",
                 ifelse(genre == "tragedy", "red", "transparent"))

# Draws the scatter plot, its legend and its grid on the current device
draw_clusters <- function() {
  plot(m, col = colors)
  # keyword position: the legend stays inside the plot whatever the range of
  # the distances (the stopword distances listed in the anomaly tables are
  # roughly 7-12, so the former fixed coordinates 18, 38 fell outside)
  legend("topleft", c("comedy", "tragedy"), col = c("blue", "red"), pch = 1)
  grid(nx = NULL, ny = NULL,
       lty = 1,      # Grid line type
       col = "gray", # Grid line color
       lwd = 1)      # Grid line width
}

#plot
draw_clusters()

# text(m[, 1], m[, 2], labels=row.names(m), cex= 0.1) # To add the name of the plays

#Save the image
png("./R/images/clusters_stop.png", width = 2500, height = 2000, res=300)
draw_clusters()
dev.off()
## quartz_off_screen 
##                 2

We check which tragedies are classified with comedies:

# Tragedies lying far from the tragedy centroid (distance above 10).
# The genre is kept as a label: cbind() turned the factor into its integer
# codes, which forced the filter to use the magic number 10 for "tragedy".
literaryGenre <- metadata[, "Genre"]
check_anomalies <- data.frame(m, literaryGenre = literaryGenre)
subset(check_anomalies, literaryGenre == "tragedy" & DistTragedyCentroid > 10)
##                                    DistTragedyCentroid DistComedyCentroid
## aubignac-pucelle-prose.txt                    10.12718           9.878585
## delavigne-famille-temps-luther.txt            10.09809          10.880326
## mathieu-magicienne-etrangere.txt              10.57133          11.581096
## piron-nouvelle-messaline.txt                  10.26928          10.859707
## puget-de-la-serre-pandoste-ii.txt             10.57675          10.267505
## puget-de-la-serre-thesee.txt                  10.65710           9.905667
## puget-de-la-serre-thomas-morus.txt            10.49597          10.588407
## viau-pyrame.txt                               10.17409          11.014445
##                                    literaryGenre
## aubignac-pucelle-prose.txt                    10
## delavigne-famille-temps-luther.txt            10
## mathieu-magicienne-etrangere.txt              10
## piron-nouvelle-messaline.txt                  10
## puget-de-la-serre-pandoste-ii.txt             10
## puget-de-la-serre-thesee.txt                  10
## puget-de-la-serre-thomas-morus.txt            10
## viau-pyrame.txt                               10

We check which comedies are classified with tragedies:

# Comedies lying close to the tragedy centroid (distance below 9).
# The genre is kept as a label: cbind() turned the factor into its integer
# codes, which forced the filter to use the magic number 2 for "comedy".
literaryGenre <- metadata[, "Genre"]
check_anomalies <- data.frame(m, literaryGenre = literaryGenre)
subset(check_anomalies, literaryGenre == "comedy" & DistTragedyCentroid < 9)
##                                        DistTragedyCentroid DistComedyCentroid
## archambault-etrennes.txt                          8.433885           8.781233
## chapuzeau-geneve-delivree.txt                     8.684980          11.118776
## cinq-auteurs-comedie-des-tuileries.txt            8.769898          10.867372
## colle-alfonse.txt                                 8.727235          11.825259
## corneillep-illusion-comique.txt                   8.564171          11.183531
## corneillep-melite.txt                             8.865439          10.693274
## corneillet-geolier-de-sois-meme.txt               7.810412           9.790915
## corneillet-illustres-ennemis.txt                  8.454825          10.072041
## cubieres-palmezeaux-lacrymanie.txt                8.516884          11.059902
## labaume-messe-de-gnide.txt                        8.183805          10.693331
## moline-legislatrices.txt                          8.180516           7.169399
## rotrou-bague-de-l-oubli.txt                       8.671355          10.658463
## rotrou-belle-alphrede.txt                         8.229971          11.400531
## rotrou-sosies.txt                                 8.590999          10.306664
## saint-roman-dialogue.txt                          8.877755          11.704875
##                                        literaryGenre
## archambault-etrennes.txt                           2
## chapuzeau-geneve-delivree.txt                      2
## cinq-auteurs-comedie-des-tuileries.txt             2
## colle-alfonse.txt                                  2
## corneillep-illusion-comique.txt                    2
## corneillep-melite.txt                              2
## corneillet-geolier-de-sois-meme.txt                2
## corneillet-illustres-ennemis.txt                   2
## cubieres-palmezeaux-lacrymanie.txt                 2
## labaume-messe-de-gnide.txt                         2
## moline-legislatrices.txt                           2
## rotrou-bague-de-l-oubli.txt                        2
## rotrou-belle-alphrede.txt                          2
## rotrou-sosies.txt                                  2
## saint-roman-dialogue.txt                           2

3.1.2 Generic value of 3-grams

We check that 3-grams do identify genres

# Distance of every play to the centroid of all tragedies and of all comedies,
# computed on 3-grams.
distToTragedy <- DistToCentroid(d, centroid = rowMeans(d[, metadata[, "Genre"] == "tragedy"]))

distToComedy <- DistToCentroid(d, centroid = rowMeans(d[, metadata[, "Genre"] == "comedy"]))

m <- cbind(distToTragedy, distToComedy)
colnames(m) <- c("DistTragedyCentroid","DistComedyCentroid")

#create transparent color (not used by the plot below; kept for later chunks)
mycol <- t_col("white", perc = 100, name = "transparent")

#fix colors: chosen from the genre NAME, so the mapping no longer depends on
#the number or the order of the factor levels
genre <- as.character(metadata[, "Genre"])
colors <- ifelse(genre == "comedy", "blue",
                 ifelse(genre == "tragedy", "red", "transparent"))

# Draws the scatter plot, its legend and its grid on the current device
draw_clusters <- function() {
  plot(m, col = colors)
  # keyword position instead of fixed coordinates: the legend stays inside the
  # plot whatever the range of the distances
  legend("topleft", c("comedy", "tragedy"), col = c("blue", "red"), pch = 1)
  grid(nx = NULL, ny = NULL,
       lty = 1,      # Grid line type
       col = "gray", # Grid line color
       lwd = 1)      # Grid line width
}

#plot
draw_clusters()

# text(m[, 1], m[, 2], labels=row.names(m), cex= 0.1) # To add the name of the plays

#Save the image
png("./R/images/clusters_3grams.png", width = 2500, height = 2000, res=300)
draw_clusters()
dev.off()
## quartz_off_screen 
##                 2

Results are more precise than with stopwords. Now we do the same, but only with plays written between 1500 and 1800:

# Same clustering restricted to plays dated between 1500 and 1800 (exclusive).
# ONE numeric mask selects the plays everywhere: the dates used to be compared
# as numbers for the distances but as strings ("1800", "1500") for the
# metadata, and plays without a date were handled by two different mechanisms.
in_period <- !is.na(metadata[, "Date"]) &
  metadata[, "Date"] > 1500 & metadata[, "Date"] < 1800

#Centroid of tragedies
distToTragedy <- DistToCentroid(d, centroid = rowMeans(d[, metadata[, "Genre"] == "tragedy" & in_period]))

#Centroid of comedies
distToComedy <- DistToCentroid(d, centroid = rowMeans(d[, metadata[, "Genre"] == "comedy" & in_period]))

#One table with the two distances (one row per play)
m <- cbind(distToTragedy, distToComedy)
colnames(m) <- c("DistTragedyCentroid","DistComedyCentroid")

#getting rid of too old/too recent plays (m itself is left untouched)
m_clean <- m[in_period, , drop = FALSE]
#rows with a missing distance are removed from the distances AND from the
#metadata, so that both stay aligned
complete <- complete.cases(m_clean)
m_clean2 <- m_clean[complete, , drop = FALSE]
metadata_clean <- metadata[in_period, ][complete, ]

#create transparent color (not used by the plot below; kept for later chunks)
mycol <- t_col("white", perc = 100, name = "transparent")

#fix colors: chosen from the genre NAME, so the mapping no longer depends on
#the number or the order of the factor levels
genre <- as.character(metadata_clean[, "Genre"])
colors <- ifelse(genre == "comedy", "blue",
                 ifelse(genre == "tragedy", "red", "transparent"))

# Draws the scatter plot, its legend and its grid on the current device
draw_clusters <- function() {
  plot(m_clean2, col = colors)
  # keyword position instead of fixed coordinates: the legend stays inside the
  # plot whatever the range of the distances
  legend("topleft", c("comedy", "tragedy"), col = c("blue", "red"), pch = 1)
  grid(nx = NULL, ny = NULL,
       lty = 1,      # Grid line type
       col = "gray", # Grid line color
       lwd = 1)      # Grid line width
}

#plot
draw_clusters()

# text(m_clean2[, 1], m_clean2[, 2], labels=row.names(m_clean2), cex= 0.1) # To add the name of the plays

#Save the image under its own name: "clusters_3grams.png" is already written by
#the plot of the whole corpus and was overwritten here
png("./R/images/clusters_3grams_1500_1800.png", width = 2500, height = 2000, res=300)
draw_clusters()
dev.off()
## quartz_off_screen 
##                 2

We check which tragedies are classified with comedies:

# Tragedies (1500-1800) lying far from the tragedy centroid (distance above 33).
# The genre is kept as a label: cbind() turned the factor into its integer
# codes, which forced the filter to use the magic number 10 for "tragedy".
literaryGenre <- metadata_clean[, "Genre"]
check_anomalies <- data.frame(m_clean2, literaryGenre = literaryGenre)
subset(check_anomalies, literaryGenre == "tragedy" & DistTragedyCentroid > 33)
##                                      DistTragedyCentroid DistComedyCentroid
## bievre-vercingentorixe.txt                      33.42857           35.06555
## champrepus-ulysse.txt                           35.02691           36.81744
## deshoulieres-mort-de-cochon.txt                 34.30683           31.49414
## donneau-de-vise-amours-du-soleil.txt            33.29494           33.16815
## puget-de-la-serre-pandoste-ii.txt               34.99850           34.82156
## puget-de-la-serre-thesee.txt                    33.80002           31.19131
## soret-ceciliade.txt                             34.31839           34.24210
##                                      literaryGenre
## bievre-vercingentorixe.txt                      10
## champrepus-ulysse.txt                           10
## deshoulieres-mort-de-cochon.txt                 10
## donneau-de-vise-amours-du-soleil.txt            10
## puget-de-la-serre-pandoste-ii.txt               10
## puget-de-la-serre-thesee.txt                    10
## soret-ceciliade.txt                             10

We check which comedies are classified with tragedies:

# Comedies (1500-1800) lying close to the tragedy centroid (distance below 32).
# The genre is kept as a label: cbind() turned the factor into its integer
# codes, which forced the filter to use the magic number 2 for "comedy".
literaryGenre <- metadata_clean[, "Genre"]
check_anomalies <- data.frame(m_clean2, literaryGenre = literaryGenre)
subset(check_anomalies, literaryGenre == "comedy" & DistTragedyCentroid < 32)
##                                        DistTragedyCentroid DistComedyCentroid
## brosse-aveugle-clairvoyant.txt                    31.68002           32.72420
## cailleau-tragedies-voltaire.txt                   31.78211           26.69976
## colle-alfonse.txt                                 29.49724           36.12450
## corneillep-melite.txt                             31.31384           32.51813
## corneillep-place-royale.txt                       31.66732           33.92384
## corneillep-suivante.txt                           30.43139           31.51745
## corneillet-charme-de-la-voix.txt                  29.28797           30.36489
## corneillet-geolier-de-sois-meme.txt               29.81206           32.83324
## corneillet-illustres-ennemis.txt                  29.67439           34.26227
## doruxigne-alzate.txt                              28.83935           34.69877
## dumaniant-francais-en-huronie-1787.txt            31.58426           33.95121
## lesage-dorneval-ile-gougou.txt                    31.98525           21.88079
## moline-legislatrices.txt                          31.09307           32.53820
## ouville-soupcons.txt                              30.78827           30.35115
## rotrou-bague-de-l-oubli.txt                       31.94664           34.41904
## rotrou-belle-alphrede.txt                         29.02633           36.57280
## rotrou-sosies.txt                                 30.70770           34.52032
## scarron-boutades-matamore.txt                     30.97121           30.95656
## scudery-fils-suppose.txt                          31.72470           34.40757
## villiers-critique-du-tartuffe.txt                 31.88891           29.41916
##                                        literaryGenre
## brosse-aveugle-clairvoyant.txt                     2
## cailleau-tragedies-voltaire.txt                    2
## colle-alfonse.txt                                  2
## corneillep-melite.txt                              2
## corneillep-place-royale.txt                        2
## corneillep-suivante.txt                            2
## corneillet-charme-de-la-voix.txt                   2
## corneillet-geolier-de-sois-meme.txt                2
## corneillet-illustres-ennemis.txt                   2
## doruxigne-alzate.txt                               2
## dumaniant-francais-en-huronie-1787.txt             2
## lesage-dorneval-ile-gougou.txt                     2
## moline-legislatrices.txt                           2
## ouville-soupcons.txt                               2
## rotrou-bague-de-l-oubli.txt                        2
## rotrou-belle-alphrede.txt                          2
## rotrou-sosies.txt                                  2
## scarron-boutades-matamore.txt                      2
## scudery-fils-suppose.txt                           2
## villiers-critique-du-tartuffe.txt                  2

3.2 Within inertia of the two clusters

Warning: on all the dataset, not on plays written between 1500 and 1800.

#Get all possible genres
levels(metadata[, "Genre"])
##  [1] ""            "comedy"      "dialogue"    "drama"       "farce"      
##  [6] "monologue"   "opera"       "proverbe"    "saynete"     "tragedy"    
## [11] "tragicomedy" "vaudeville"
#Get genre of all plays
clusters <- metadata[, "Genre"]
# Relabel the genres 1..k, k being the actual number of levels (12 in the list
# printed above); the former hard-coded 1:13 added a spurious, empty 13th level
levels(clusters) <- seq_along(levels(clusters))

#Compute inertia (one value per genre code)
clusterInertia(t(d), as.numeric(clusters))
##  [1] 198.08189 624.87127  23.60303  30.45398  10.32456  56.62311  11.61778
##  [8] 112.83419  21.57262 212.44350  61.53742   5.48289

3.3 Centroid distance

Comedy (with 3-grams)

# Keep the comedies only and measure the Manhattan distance of each of them to
# the centroid (none is passed, so presumably the centroid of the comedies
# themselves — confirm in functions.R)
is_comedy <- metadata[, "Genre"] == "comedy"
comedies <- d[, is_comedy]
comediesToCentroid <- DistToCentroid(comedies, method = "manhattan")

summary(comediesToCentroid)
##  DistToCentroid 
##  Min.   :20.97  
##  1st Qu.:30.11  
##  Median :31.50  
##  Mean   :31.36  
##  3rd Qu.:32.80  
##  Max.   :37.48
# Spread of the distances
boxplot(comediesToCentroid)

# Most typical comedies: the six smallest distances
by_distance <- order(comediesToCentroid[, 1])
head(comediesToCentroid[by_distance, ])
##        genlis-belle-et-la-bete.txt     lesage-dorneval-ile-gougou.txt 
##                           20.96861                           21.59379 
##             liborliere-cloison.txt         dancourt-mari-retrouve.txt 
##                           25.91855                           26.14891 
##     palissot-barbier-de-bagdad.txt dancourt-impromptu-de-garnison.txt 
##                           26.17214                           26.27755
# Least typical comedies: the six largest distances
by_distance <- order(comediesToCentroid[, 1])
tail(comediesToCentroid[by_distance, ])
##                 colle-alfonse.txt     chapuzeau-geneve-delivree.txt 
##                          36.09004                          36.64682 
##   corneillep-illusion-comique.txt quinault-comedie-sans-comedie.txt 
##                          36.64705                          36.72005 
##         rotrou-belle-alphrede.txt                colle-cocatrix.txt 
##                          36.88809                          37.47806

Tragedies (with 3-grams)

# Keep the tragedies only and measure the Manhattan distance of each of them to
# the centroid (none is passed, so presumably the centroid of the tragedies
# themselves — confirm in functions.R)
is_tragedy <- metadata[, "Genre"] == "tragedy"
tragedies <- d[, is_tragedy]
tragediesToCentroid <- DistToCentroid(tragedies, method = "manhattan")

summary(tragediesToCentroid)
##  DistToCentroid 
##  Min.   :18.76  
##  1st Qu.:24.87  
##  Median :26.27  
##  Mean   :26.82  
##  3rd Qu.:28.29  
##  Max.   :35.04
# Spread of the distances
boxplot(tragediesToCentroid)

# Most typical tragedies: the six smallest distances
by_distance <- order(tragediesToCentroid[, 1])
head(tragediesToCentroid[by_distance, ])
##  la-thuilerie-soliman.txt barbier-mort-de-cesar.txt      saurin-spartacus.txt 
##                  18.76257                  20.31262                  21.40985 
##       genest-zelonide.txt      pellegrin-tibere.txt     voltaire-mariamne.txt 
##                  21.86938                  22.19090                  22.40103
# Least typical tragedies: the six largest distances
by_distance <- order(tragediesToCentroid[, 1])
tail(tragediesToCentroid[by_distance, ])
##       puget-de-la-serre-thesee.txt    deshoulieres-mort-de-cochon.txt 
##                           33.84736                           34.30966 
##                soret-ceciliade.txt delavigne-famille-temps-luther.txt 
##                           34.31754                           34.70682 
##              champrepus-ulysse.txt  puget-de-la-serre-pandoste-ii.txt 
##                           35.01097                           35.03667

4 Generic evolution

4.1 Tragedy

4.1.1 With 3-grams

# Tragedies dated between 1500 and 1800: distance to the centroid (3-grams)
# plotted against the date, with a loess trend
is_tragedy <- metadata[, "Genre"] == 'tragedy' &
  metadata[, "Date"] < 1800 & metadata[, "Date"] > 1500
tragedies <- d[, is_tragedy]
tragediesToCentroid <- DistToCentroid(tragedies, method = "manhattan")
# distances side by side with the metadata of the same plays
evoCentroid <- cbind(tragediesToCentroid, metadata[is_tragedy, ])
vizEvoCentroid <- ggplot(evoCentroid, aes(x = Date, y = DistToCentroid)) +
  geom_point(stat = "summary", fun = "mean") +
  geom_smooth(method = loess, size = 1) +
  theme_bw()
ggsave("./R/images/tragedy3grams.png", plot = vizEvoCentroid,
       width = 2500, height = 2000, units = "px", dpi = 300)
vizEvoCentroid

4.1.2 With words

# Tragedies dated between 1500 and 1800: distance to the centroid (word
# frequencies) plotted against the date, with a loess trend
is_tragedy <- metadata[, "Genre"] == 'tragedy' &
  metadata[, "Date"] < 1800 & metadata[, "Date"] > 1500
tragedies <- corpus.tok.list.freq[, is_tragedy]
tragediesToCentroid <- DistToCentroid(tragedies, method = "manhattan")
# distances side by side with the metadata of the same plays
evoCentroid <- cbind(tragediesToCentroid, metadata[is_tragedy, ])
vizEvoCentroid <- ggplot(evoCentroid, aes(x = Date, y = DistToCentroid)) +
  geom_point(stat = "summary", fun = "mean") +
  geom_smooth(method = loess, size = 1) +
  theme_bw()
ggsave("./R/images/tragedyWords.png", plot = vizEvoCentroid,
       width = 2500, height = 2000, units = "px", dpi = 300)
vizEvoCentroid

4.1.3 With stopwords

# Tragedies dated between 1500 and 1800: distance to the centroid (stopwords)
# plotted against the date, with a loess trend
is_tragedy <- metadata[, "Genre"] == 'tragedy' &
  metadata[, "Date"] < 1800 & metadata[, "Date"] > 1500
tragedies <- d_stop[, is_tragedy]
tragediesToCentroid <- DistToCentroid(tragedies, method = "manhattan")
# distances side by side with the metadata of the same plays
evoCentroid <- cbind(tragediesToCentroid, metadata[is_tragedy, ])
vizEvoCentroid <- ggplot(evoCentroid, aes(x = Date, y = DistToCentroid)) +
  geom_point(stat = "summary", fun = "mean") +
  geom_smooth(method = loess, size = 1) +
  theme_bw()
ggsave("./R/images/tragedyStop.png", plot = vizEvoCentroid,
       width = 2500, height = 2000, units = "px", dpi = 300)
vizEvoCentroid

4.2 Comedy

4.2.1 With 3-grams

# Comedies dated between 1500 and 1800: distance to the centroid (3-grams)
# plotted against the date, with a loess trend.
# The data are stored under comedy names: this chunk used to overwrite
# `tragedies` and `tragediesToCentroid` with comedy data.
is_comedy <- metadata[, "Genre"] == 'comedy' &
  metadata[, "Date"] < 1800 & metadata[, "Date"] > 1500
comedies <- d[, is_comedy]
comediesToCentroid <- DistToCentroid(comedies, method = "manhattan")
# distances side by side with the metadata of the same plays
evoCentroid <- cbind(comediesToCentroid, metadata[is_comedy, ])
vizEvoCentroid <- ggplot(data = evoCentroid, mapping = aes(x = Date, y = DistToCentroid)) +
  geom_point(stat = "summary", fun = "mean") +
  geom_smooth(method = loess, size = 1) +
  theme_bw()
ggsave("./R/images/comedy3grams.png", plot = vizEvoCentroid, width = 2500, height = 2000, units = "px", dpi = 300)
vizEvoCentroid

4.2.2 With words

# Comedies dated between 1500 and 1800: distance to the centroid (word
# frequencies) plotted against the date, with a loess trend.
# The data are stored under comedy names: this chunk used to overwrite
# `tragedies` and `tragediesToCentroid` with comedy data.
is_comedy <- metadata[, "Genre"] == 'comedy' &
  metadata[, "Date"] < 1800 & metadata[, "Date"] > 1500
comedies <- corpus.tok.list.freq[, is_comedy]
comediesToCentroid <- DistToCentroid(comedies, method = "manhattan")
# distances side by side with the metadata of the same plays
evoCentroid <- cbind(comediesToCentroid, metadata[is_comedy, ])
vizEvoCentroid <- ggplot(data = evoCentroid, mapping = aes(x = Date, y = DistToCentroid)) +
  geom_point(stat = "summary", fun = "mean") +
  geom_smooth(method = loess, size = 1) +
  theme_bw()
ggsave("./R/images/comedyWords.png", plot = vizEvoCentroid, width = 2500, height = 2000, units = "px", dpi = 300)
vizEvoCentroid

4.2.3 With stopwords

# Comedies dated between 1500 and 1800: distance to the centroid (stopwords)
# plotted against the date, with a loess trend.
# The data are stored under comedy names: this chunk used to overwrite
# `tragedies` and `tragediesToCentroid` with comedy data.
is_comedy <- metadata[, "Genre"] == 'comedy' &
  metadata[, "Date"] < 1800 & metadata[, "Date"] > 1500
comedies <- d_stop[, is_comedy]
comediesToCentroid <- DistToCentroid(comedies, method = "manhattan")
# distances side by side with the metadata of the same plays
evoCentroid <- cbind(comediesToCentroid, metadata[is_comedy, ])
vizEvoCentroid <- ggplot(data = evoCentroid, mapping = aes(x = Date, y = DistToCentroid)) +
  geom_point(stat = "summary", fun = "mean") +
  geom_smooth(method = loess, size = 1) +
  theme_bw()
ggsave("./R/images/comedyStop.png", plot = vizEvoCentroid, width = 2500, height = 2000, units = "px", dpi = 300)
vizEvoCentroid

5 Distance to auctorial centroid vs generic centroid

5.1 Comedy

We select all the authors with at least 3 plays in the dataset

#all authors: number of plays per author
n_occur <- data.frame(table(metadata$Author))
# Remove the entry of the plays without author (empty value). It is matched by
# its value: dropping the first row by position would remove a real author if
# the table did not start with the empty value.
n_occur <- n_occur[n_occur$Var1 != "", ]

#authors with at least 3 plays (Freq > 2)
multiples <- n_occur[n_occur$Freq > 2,]
#transform df into vector
multiples <- multiples$Var1
#number of authors
length(multiples)
## [1] 113

We select all the authors who have at least 3 comedies

# Authors of `multiples` who wrote at least 3 comedies (more than 2).
# The comedies are counted once per author with table() instead of building a
# data frame per author and growing the result inside a loop.
comedy_counts <- table(metadata$Author[metadata$Genre == "comedy"])
prolific <- names(comedy_counts)[as.vector(comedy_counts) > 2]
# same order as in `multiples`
candidates <- as.character(multiples)
authorsSelected <- candidates[candidates %in% prolific]

We compute the distance to the centroid of the author, the centroid of the genre and the distance between both:

# For every comedy dated between 1500 and 1800 whose author is in
# authorsSelected, store the distance of the play to the centroid of the
# comedies of its author, to the centroid of the comedies written within 30
# years of it, and the gap between the two distances.

#names of the eight values stored for each play
comedy_cols <- c("play", "author", "genre", "date", "distance",
                 "distance2author", "distance2genre", "meanGenre")
#get name of plays
plays <- rownames(metadata)
#The rows of metadata and the columns of d are used as parallel lists by every
#mask of the script, so the column name found at the same position is the exact
#key under which DistToCentroid() reports a play. Looking plays up with the
#metadata name relied on partial matching of row names, which silently drops a
#play whose name is the prefix of another one.
playKeys <- colnames(d)
#one slot per play; the slots of the skipped plays stay NULL (no rbind in the loop)
comedy_rows <- vector("list", length(plays))

#loop over plays
for (incr in seq_along(plays)) {
  x <- plays[incr]
  #author, genre and date are read by position (columns 2, 4 and 3)
  # NOTE(review): presumably the "Author", "Genre" and "Date" columns used by name below — confirm
  author <- metadata[incr, 2]
  genre <- metadata[incr, 4]
  date <- metadata[incr, 3]
  #keep the comedies of the selected authors written between 1500 and 1800;
  #a play without date is skipped instead of stopping the loop
  selected <- !is.na(date) && author %in% authorsSelected &&
    isTRUE(genre == 'comedy') && date > 1500 && date < 1800
  if (!selected) {
    next
  }
  #get the data of all the plays of the author in this genre
  sameAuthor <- which(metadata[, "Author"] == author & metadata[, "Genre"] == genre)
  authorData <- d[, sameAuthor, drop = FALSE]
  #get all the plays of the same genre written less than 30 years apart
  sameGenre <- which(metadata[, "Genre"] == genre &
                       metadata[, "Date"] < date + 30 & metadata[, "Date"] > date - 30)
  genreData <- d[, sameGenre, drop = FALSE]
  #compute distance to centroid of the author
  authorToCentroid <- as.data.frame(DistToCentroid(authorData, method = "manhattan"))
  #compute distance to centroid of the genre
  genreToCentroid <- as.data.frame(DistToCentroid(genreData, method = "manhattan"))
  #mean distance of the plays of the genre to their centroid
  meanGenre <- mean(as.numeric(genreToCentroid$DistToCentroid))
  #get the distance of the play to the author (exact match on the key)
  key <- playKeys[incr]
  playDistAuthor <- authorToCentroid[match(key, rownames(authorToCentroid)), 1]
  #get the distance of the play to the genre
  playDistGenre <- genreToCentroid[match(key, rownames(genreToCentroid)), 1]
  #diff author genre (rounded)
  distance <- round(euclidean(playDistGenre, playDistAuthor), digits = 3)
  #save the result unless the play was not found (missing distance)
  if (!is.na(playDistAuthor)) {
    #author and genre are stored as their integer level codes, as before: the
    #highlight chunks below select an author by such a code
    comedy_rows[[incr]] <- c(x, as.integer(author), as.integer(genre), date,
                             distance, playDistAuthor, playDistGenre, meanGenre)
  }
}
#bind the stored rows once; every column is character, as before
comedy_rows <- comedy_rows[!vapply(comedy_rows, is.null, logical(1))]
df_comedy <- as.data.frame(
  matrix(as.character(unlist(comedy_rows)), ncol = length(comedy_cols), byrow = TRUE),
  stringsAsFactors = FALSE
)
#give a name to the columns
colnames(df_comedy) <- comedy_cols
labelPoints <- rownames(df_comedy)

5.1.1 Distance to author

# Distance of each comedy to its author over time, with one colour and one text
# label per value of the author column, plus a loess trend
# (alternative labelling: label = rownames(df_comedy))
vizEvoCentroid <- ggplot(
  df_comedy,
  aes(x = as.numeric(date), y = as.numeric(distance2author), label = author)
) +
  geom_point(aes(colour = author), stat = "summary", fun = "mean") +
  geom_smooth(method = loess, size = 1) +
  xlab("Date") +
  ylab("Distance to the author") +
  theme_bw() +
  geom_text(hjust = 0, vjust = 0, size = 3)
ggsave("R/images/distance2author_comedy_labels.png", plot = vizEvoCentroid,
       width = 2500, height = 2000, units = "px", dpi = 300)
vizEvoCentroid

# Same plot without text labels and without legend
vizEvoCentroid <- ggplot(
  df_comedy,
  aes(x = as.numeric(date), y = as.numeric(distance2author), label = author)
) +
  geom_point(aes(colour = author), stat = "summary", fun = "mean",
             show.legend = FALSE) +
  geom_smooth(method = loess, size = 1) +
  xlab("Date") +
  ylab("Distance to the author") +
  theme_bw()
ggsave("R/images/distance2author_comedy.png", plot = vizEvoCentroid,
       width = 2500, height = 2000, units = "px", dpi = 300)
vizEvoCentroid

5.1.1.1 Marivaux

# Highlight one author: plays whose author code is 168 are drawn in blue, all
# the others are made transparent
# NOTE(review): 168 is presumably Marivaux's code in the author column (per the
# section title and the file name) — confirm
is_highlighted <- as.numeric(df_comedy$author) == 168
df_comedy$color <- ifelse(is_highlighted, "blue", "white")
truc <- df_comedy$color
vizEvoCentroid <- ggplot(
  df_comedy,
  aes(x = as.numeric(date), y = as.numeric(distance2author))
) +
  geom_point(aes(colour = color), stat = "summary", fun = "mean",
             show.legend = FALSE) +
  geom_smooth(method = loess, size = 1) +
  xlab("Date") +
  ylab("Distance to the author") +
  scale_color_manual(values = c("blue" = "blue", "white" = "transparent")) +
  theme_bw()
ggsave("R/images/distance2author_comedy_marivaux.png", plot = vizEvoCentroid,
       width = 2500, height = 2000, units = "px", dpi = 300)
vizEvoCentroid

5.1.1.2 Voltaire

# Highlight one author: plays whose author code is 148 are drawn in blue, all
# the others are made transparent
# NOTE(review): 148 is presumably Voltaire's code in the author column (per the
# section title and the file name) — confirm
is_highlighted <- as.numeric(df_comedy$author) == 148
df_comedy$color <- ifelse(is_highlighted, "blue", "white")
truc <- df_comedy$color
vizEvoCentroid <- ggplot(
  df_comedy,
  aes(x = as.numeric(date), y = as.numeric(distance2author))
) +
  geom_point(aes(colour = color), stat = "summary", fun = "mean",
             show.legend = FALSE) +
  geom_smooth(method = loess, size = 1) +
  xlab("Date") +
  ylab("Distance to the author") +
  scale_color_manual(values = c("blue" = "blue", "white" = "transparent")) +
  theme_bw()
ggsave("R/images/distance2author_comedy_voltaire.png", plot = vizEvoCentroid,
       width = 2500, height = 2000, units = "px", dpi = 300)
vizEvoCentroid

5.1.1.3 Boissy

# Highlight one author: plays whose author code is 249 are drawn in blue, all
# the others are made transparent
# NOTE(review): 249 is presumably Boissy's code in the author column (per the
# section title and the file name) — confirm
is_highlighted <- as.numeric(df_comedy$author) == 249
df_comedy$color <- ifelse(is_highlighted, "blue", "white")
truc <- df_comedy$color
vizEvoCentroid <- ggplot(
  df_comedy,
  aes(x = as.numeric(date), y = as.numeric(distance2author))
) +
  geom_point(aes(colour = color), stat = "summary", fun = "mean",
             show.legend = FALSE) +
  geom_smooth(method = loess, size = 1) +
  xlab("Date") +
  ylab("Distance to the author") +
  scale_color_manual(values = c("blue" = "blue", "white" = "transparent")) +
  theme_bw()
ggsave("R/images/distance2author_comedy_boissy.png", plot = vizEvoCentroid,
       width = 2500, height = 2000, units = "px", dpi = 300)
vizEvoCentroid

5.1.1.4 Molière

# Highlight one author: plays whose author code is 184 are drawn in blue, all
# the others are made transparent
# NOTE(review): 184 is presumably Molière's code in the author column (per the
# section title and the file name) — confirm
is_highlighted <- as.numeric(df_comedy$author) == 184
df_comedy$color <- ifelse(is_highlighted, "blue", "white")
truc <- df_comedy$color
vizEvoCentroid <- ggplot(
  df_comedy,
  aes(x = as.numeric(date), y = as.numeric(distance2author))
) +
  geom_point(aes(colour = color), stat = "summary", fun = "mean",
             show.legend = FALSE) +
  geom_smooth(method = loess, size = 1) +
  xlab("Date") +
  ylab("Distance to the author") +
  scale_color_manual(values = c("blue" = "blue", "white" = "transparent")) +
  theme_bw()
ggsave("R/images/distance2author_comedy_moliere.png", plot = vizEvoCentroid,
       width = 2500, height = 2000, units = "px", dpi = 300)
vizEvoCentroid

5.1.2 Distance to genre

# Distance to the genre centroid over time, comedies.
# First figure: points coloured by author (legend shown) and every point
# labelled with its author value; a loess curve summarises the trend.
# (Use label = rownames(df_comedy) instead to label points with row names.)
vizEvoCentroid <- ggplot(
  data = df_comedy,
  mapping = aes(
    x = as.numeric(date),
    y = as.numeric(distance2genre),
    label = author
  )
) +
  geom_point(mapping = aes(colour = author), stat = "summary", fun = "mean") +
  geom_smooth(method = loess, size = 1) +
  geom_text(hjust = 0, vjust = 0, size = 3) +
  labs(x = "Date", y = "Distance to the genre") +
  theme_bw()
ggsave(
  filename = "R/images/distance2genre_comedy_labels.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

# Second figure: same data without text labels and without the colour legend.
vizEvoCentroid <- ggplot(
  data = df_comedy,
  mapping = aes(
    x = as.numeric(date),
    y = as.numeric(distance2genre),
    label = author
  )
) +
  geom_point(
    mapping = aes(colour = author),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the genre") +
  theme_bw()
ggsave(
  filename = "R/images/distance2genre_comedy.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.1.2.1 Marivaux

# Highlight one author in the comedy "distance to genre" plot: rows whose
# numeric author code equals 168 are drawn in blue, all others are transparent.
# NOTE(review): 168 is presumably the code standing for Marivaux (per the
# section heading) -- confirm against the coding of df_comedy$author.
df_comedy$color <- ifelse(as.numeric(df_comedy$author) == 168, "blue", "white")
# copy of the colour column
truc <- df_comedy$color
# Points: mean distance per date; curve: loess fit over all rows of df_comedy.
# (To print play names, add label = rownames(df_comedy) to aes() and append
# geom_text(hjust = 0, vjust = 0, size = 3).)
vizEvoCentroid <- ggplot(
  data = df_comedy,
  mapping = aes(x = as.numeric(date), y = as.numeric(distance2genre))
) +
  geom_point(
    mapping = aes(colour = color),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the genre") +
  scale_colour_manual(values = c(blue = "blue", white = "transparent")) +
  theme_bw()
ggsave(
  filename = "R/images/distance2genre_comedy_marivaux.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.1.2.2 Voltaire

# Highlight one author in the comedy "distance to genre" plot: rows whose
# numeric author code equals 148 are drawn in blue, all others are transparent.
# NOTE(review): 148 is presumably the code standing for Voltaire (per the
# section heading) -- confirm against the coding of df_comedy$author.
df_comedy$color <- ifelse(as.numeric(df_comedy$author) == 148, "blue", "white")
# copy of the colour column
truc <- df_comedy$color
# Points: mean distance per date; curve: loess fit over all rows of df_comedy.
# (To print play names, add label = rownames(df_comedy) to aes() and append
# geom_text(hjust = 0, vjust = 0, size = 3).)
vizEvoCentroid <- ggplot(
  data = df_comedy,
  mapping = aes(x = as.numeric(date), y = as.numeric(distance2genre))
) +
  geom_point(
    mapping = aes(colour = color),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the genre") +
  scale_colour_manual(values = c(blue = "blue", white = "transparent")) +
  theme_bw()
ggsave(
  filename = "R/images/distance2genre_comedy_voltaire.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.1.2.3 Boissy

# Highlight one author in the comedy "distance to genre" plot: rows whose
# numeric author code equals 249 are drawn in blue, all others are transparent.
# NOTE(review): 249 is presumably the code standing for Boissy (per the section
# heading) -- confirm against the coding of df_comedy$author.
df_comedy$color <- ifelse(as.numeric(df_comedy$author) == 249, "blue", "white")
# copy of the colour column
truc <- df_comedy$color
# Points: mean distance per date; curve: loess fit over all rows of df_comedy.
# (To print play names, add label = rownames(df_comedy) to aes() and append
# geom_text(hjust = 0, vjust = 0, size = 3).)
vizEvoCentroid <- ggplot(
  data = df_comedy,
  mapping = aes(x = as.numeric(date), y = as.numeric(distance2genre))
) +
  geom_point(
    mapping = aes(colour = color),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the genre") +
  scale_colour_manual(values = c(blue = "blue", white = "transparent")) +
  theme_bw()
ggsave(
  filename = "R/images/distance2genre_comedy_boissy.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.1.2.4 Molière

# Highlight one author in the comedy "distance to genre" plot: rows whose
# numeric author code equals 184 are drawn in blue, all others are transparent.
# NOTE(review): 184 is presumably the code standing for Molière (per the
# section heading) -- confirm against the coding of df_comedy$author.
df_comedy$color <- ifelse(as.numeric(df_comedy$author) == 184, "blue", "white")
# copy of the colour column
truc <- df_comedy$color
# Points: mean distance per date; curve: loess fit over all rows of df_comedy.
# (To print play names, add label = rownames(df_comedy) to aes() and append
# geom_text(hjust = 0, vjust = 0, size = 3).)
vizEvoCentroid <- ggplot(
  data = df_comedy,
  mapping = aes(x = as.numeric(date), y = as.numeric(distance2genre))
) +
  geom_point(
    mapping = aes(colour = color),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the genre") +
  scale_colour_manual(values = c(blue = "blue", white = "transparent")) +
  theme_bw()
ggsave(
  filename = "R/images/distance2genre_comedy_moliere.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.1.3 Distance between the two centroids

# Plot the `distance` column of df_comedy against the date: points are the
# per-date means coloured by author (legend hidden), with a loess trend curve.
# (Append geom_text(hjust = 0, vjust = 0, size = 3) to print the author label
# next to each point, or map label = rownames(df_comedy) for row names.)
vizEvoCentroid <- ggplot(
  data = df_comedy,
  mapping = aes(
    x = as.numeric(date),
    y = as.numeric(distance),
    label = author
  )
) +
  geom_point(
    mapping = aes(colour = author),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance") +
  theme_bw()
ggsave(
  filename = "R/images/spread_comedy.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

Controlling variance

# Rolling variance (window of 30 rows, at least 1 observation) of the
# `distance` column of df_comedy, with a loess curve fitted on the row index.
# NOTE(review): the window runs over the row order of df_comedy, not over the
# date column -- confirm this ordering is the intended one.
# seq_len() stays empty for an empty data frame, where 1:nrow() gives c(1, 0).
x <- seq_len(nrow(df_comedy))
y <- roll_var(as.numeric(df_comedy$distance), width = 30, min_obs = 1)
lo <- loess(y ~ x)
# Predict on the full index so the curve stays aligned with x even when loess
# dropped rows whose rolling variance is NA (predict(lo) alone returns one
# value per fitted row and would be drawn shifted).
fit <- predict(lo, newdata = data.frame(x = x))
plot(x, y, xlab = "index", ylab = "variance")
lines(x, fit, col = 'red', lwd = 2)

# Same figure written to disk. The file name is specific to comedies: the
# tragedy sections previously wrote to the same "variance2.png" and
# overwrote this figure.
png("./R/images/variance_comedy.png", width = 850, height = 600)
plot(x, y, xlab = "index", ylab = "variance")
lines(x, fit, col = 'red', lwd = 2)
dev.off()
## quartz_off_screen 
##                 2

5.2 Tragedy

We select all the authors with at least 3 plays (more than 2) in the dataset

# Number of plays per author (columns Var1 = author, Freq = count)
n_occur <- data.frame(table(metadata$Author))
# Drop the first row of the count table.
# NOTE(review): this assumes the first level is the empty author value --
# confirm, otherwise a real author is dropped here.
n_occur <- n_occur[-1, ]

# Keep the authors that occur more than twice (i.e. at least 3 plays),
# directly as a vector of author names
multiples <- n_occur$Var1[n_occur$Freq > 2]
# Number of authors retained
length(multiples)
## [1] 113

We select all the authors who have at least 3 tragedies (more than 2)

# Keep, among `multiples`, the authors with more than 2 rows in metadata whose
# Genre is 'tragedy'.
candidates <- as.character(multiples)
tragedy_counts <- vapply(
  candidates,
  function(candidate) {
    nrow(metadata[metadata$Genre == 'tragedy' & metadata$Author == candidate, ])
  },
  integer(1)
)
authorsSelected <- candidates[tragedy_counts > 2]

We compute the distance to the centroid of the author, the centroid of the genre and the distance between both:

# Build df_tragedy: one row per play whose author is in authorsSelected, whose
# genre is 'tragedy' and whose date lies strictly between 1500 and 1800.
# Each row stores the play's distance to its author's centroid, its distance to
# the centroid of tragedies dated within 30 years of it, and the value that
# euclidean() returns for these two distances.

#create an empty df to store results (8 columns: one per field stored below)
df_tragedy <- data.frame(matrix(ncol = 8, nrow = 0))

#get name of plays
plays <- rownames(metadata)
#row index into metadata, kept in step with the play names
incr <- 0

#loop over plays
for (x in plays) {
  #increment
  incr <- incr + 1
  #get author name
  author <- metadata[incr, 2]
  #get genre
  genre <- metadata[incr, 4]
  #get date
  date <- metadata[incr, 3]
  #keep only tragedies by a selected author dated strictly between 1500 and 1800
  #(scalar condition, hence && rather than the elementwise &)
  if (author %in% authorsSelected && genre == 'tragedy' && date > 1500 && date < 1800) {
    #columns of d for all the plays of this author in this genre
    authorData <- d[, metadata[, "Author"] == author & metadata[, "Genre"] == genre]
    #columns of d for the plays of the same genre dated less than 30 years
    #before or after the current play
    genreData <- d[, metadata[, "Genre"] == genre & metadata[, "Date"] < date + 30 & metadata[, "Date"] > date - 30]
    #compute distance to centroid of the author
    authorToCentroid <- as.data.frame(DistToCentroid(authorData, method = "manhattan"))
    #compute distance to centroid of the genre
    genreToCentroid <- as.data.frame(DistToCentroid(genreData, method = "manhattan"))
    #mean distance to the genre centroid over the selected plays
    meanGenre <- mean(as.numeric(genreToCentroid$DistToCentroid))
    #get the distance of the play to the author (row looked up by play name)
    playDistAuthor <- authorToCentroid[x, ]
    #get the distance of the play to the genre (row looked up by play name)
    playDistGenre <- genreToCentroid[x, ]
    #combine both distances with euclidean() (defined outside this chunk), rounded
    distance <- round(euclidean(playDistGenre, playDistAuthor), digits = 3)
    #save the result only if the comparison below is not NA, which excludes a
    #missing distance to the author
    control <- playDistAuthor != ''
    if (!is.na(control)) {
      results <- c(x, author, genre, date, distance, playDistAuthor, playDistGenre, meanGenre)
      df_tragedy <- rbind(df_tragedy, results)
    }
  }
}
#give a name to the columns
x <- c("play", "author", "genre", "date", "distance", "distance2author", "distance2genre", "meanGenre")
labelPoints <- rownames(df_tragedy)
colnames(df_tragedy) <- x

5.2.1 Distance to author

# Distance to the author centroid over time, tragedies.
# First figure: points coloured by author (legend shown) and every point
# labelled with its author value; a loess curve summarises the trend.
# (Use label = rownames(df_tragedy) instead to label points with row names.)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(
    x = as.numeric(date),
    y = as.numeric(distance2author),
    label = author
  )
) +
  geom_point(mapping = aes(colour = author), stat = "summary", fun = "mean") +
  geom_smooth(method = loess, size = 1) +
  geom_text(hjust = 0, vjust = 0, size = 3) +
  labs(x = "Date", y = "Distance to the author") +
  theme_bw()
ggsave(
  filename = "R/images/distance2author_tragedy_labels.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

# Second figure: same data without text labels and without the colour legend.
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(
    x = as.numeric(date),
    y = as.numeric(distance2author),
    label = author
  )
) +
  geom_point(
    mapping = aes(colour = author),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the author") +
  theme_bw()
ggsave(
  filename = "R/images/distance2author_tragedy.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.2.1.1 Voltaire

# Highlight one author in the tragedy "distance to author" plot: rows whose
# numeric author code equals 148 are drawn in blue, all others are transparent.
# NOTE(review): 148 is presumably the code standing for Voltaire (per the
# section heading) -- confirm against the coding of df_tragedy$author.
df_tragedy$color <- ifelse(as.numeric(df_tragedy$author) == 148, "blue", "white")
# copy of the colour column
truc <- df_tragedy$color
# Points: mean distance per date; curve: loess fit over all rows of df_tragedy.
# (To print play names, add label = rownames(df_tragedy) to aes() and append
# geom_text(hjust = 0, vjust = 0, size = 3).)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(x = as.numeric(date), y = as.numeric(distance2author))
) +
  geom_point(
    mapping = aes(colour = color),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the author") +
  scale_colour_manual(values = c(blue = "blue", white = "transparent")) +
  theme_bw()
ggsave(
  filename = "R/images/distance2author_tragedy_voltaire.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.2.1.2 Crébillon

# Highlight one author in the tragedy "distance to author" plot: rows whose
# numeric author code equals 131 are drawn in blue, all others are transparent.
# NOTE(review): 131 is presumably the code standing for Crébillon (per the
# section heading) -- confirm against the coding of df_tragedy$author.
df_tragedy$color <- ifelse(as.numeric(df_tragedy$author) == 131, "blue", "white")
# copy of the colour column
truc <- df_tragedy$color
# Points: mean distance per date; curve: loess fit over all rows of df_tragedy.
# (To print play names, add label = rownames(df_tragedy) to aes() and append
# geom_text(hjust = 0, vjust = 0, size = 3).)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(x = as.numeric(date), y = as.numeric(distance2author))
) +
  geom_point(
    mapping = aes(colour = color),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the author") +
  scale_colour_manual(values = c(blue = "blue", white = "transparent")) +
  theme_bw()
ggsave(
  filename = "R/images/distance2author_tragedy_crebillon.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.2.1.3 Racine

# Highlight one author in the tragedy "distance to author" plot: rows whose
# numeric author code equals 185 are drawn in blue, all others are transparent.
# NOTE(review): 185 is presumably the code standing for Racine (per the section
# heading) -- confirm against the coding of df_tragedy$author.
df_tragedy$color <- ifelse(as.numeric(df_tragedy$author) == 185, "blue", "white")
# copy of the colour column
truc <- df_tragedy$color
# Points: mean distance per date; curve: loess fit over all rows of df_tragedy.
# (To print play names, add label = rownames(df_tragedy) to aes() and append
# geom_text(hjust = 0, vjust = 0, size = 3).)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(x = as.numeric(date), y = as.numeric(distance2author))
) +
  geom_point(
    mapping = aes(colour = color),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the author") +
  scale_colour_manual(values = c(blue = "blue", white = "transparent")) +
  theme_bw()
ggsave(
  filename = "R/images/distance2author_tragedy_racine.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.2.2 Distance to genre

# Distance to the genre centroid over time, tragedies.
# First figure: points coloured by author (legend shown) and every point
# labelled with its author value; a loess curve summarises the trend.
# (Use label = rownames(df_tragedy) instead to label points with row names.)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(
    x = as.numeric(date),
    y = as.numeric(distance2genre),
    label = author
  )
) +
  geom_point(mapping = aes(colour = author), stat = "summary", fun = "mean") +
  geom_smooth(method = loess, size = 1) +
  geom_text(hjust = 0, vjust = 0, size = 3) +
  labs(x = "Date", y = "Distance to the genre") +
  theme_bw()
ggsave(
  filename = "R/images/distance2genre_tragedy_labels.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

# Second figure: same data without text labels and without the colour legend.
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(
    x = as.numeric(date),
    y = as.numeric(distance2genre),
    label = author
  )
) +
  geom_point(
    mapping = aes(colour = author),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the genre") +
  theme_bw()
ggsave(
  filename = "R/images/distance2genre_tragedy.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.2.2.1 Voltaire

# Highlight one author in the tragedy "distance to genre" plot: rows whose
# numeric author code equals 148 are drawn in blue, all others are transparent.
# NOTE(review): 148 is presumably the code standing for Voltaire (per the
# section heading) -- confirm against the coding of df_tragedy$author.
df_tragedy$color <- ifelse(as.numeric(df_tragedy$author) == 148, "blue", "white")
# copy of the colour column
truc <- df_tragedy$color
# Points: mean distance per date; curve: loess fit over all rows of df_tragedy.
# (To print play names, add label = rownames(df_tragedy) to aes() and append
# geom_text(hjust = 0, vjust = 0, size = 3).)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(x = as.numeric(date), y = as.numeric(distance2genre))
) +
  geom_point(
    mapping = aes(colour = color),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the genre") +
  scale_colour_manual(values = c(blue = "blue", white = "transparent")) +
  theme_bw()
ggsave(
  filename = "R/images/distance2genre_tragedy_voltaire.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.2.2.2 Crébillon

# Highlight one author in the tragedy "distance to genre" plot: rows whose
# numeric author code equals 131 are drawn in blue, all others are transparent.
# NOTE(review): 131 is presumably the code standing for Crébillon (per the
# section heading) -- confirm against the coding of df_tragedy$author.
df_tragedy$color <- ifelse(as.numeric(df_tragedy$author) == 131, "blue", "white")
# copy of the colour column
truc <- df_tragedy$color
# Points: mean distance per date; curve: loess fit over all rows of df_tragedy.
# (To print play names, add label = rownames(df_tragedy) to aes() and append
# geom_text(hjust = 0, vjust = 0, size = 3).)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(x = as.numeric(date), y = as.numeric(distance2genre))
) +
  geom_point(
    mapping = aes(colour = color),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the genre") +
  scale_colour_manual(values = c(blue = "blue", white = "transparent")) +
  theme_bw()
ggsave(
  filename = "R/images/distance2genre_tragedy_crebillon.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.2.2.3 Racine

# Highlight one author in the tragedy "distance to genre" plot: rows whose
# numeric author code equals 185 are drawn in blue, all others are transparent.
# NOTE(review): 185 is presumably the code standing for Racine (per the section
# heading) -- confirm against the coding of df_tragedy$author.
df_tragedy$color <- ifelse(as.numeric(df_tragedy$author) == 185, "blue", "white")
# copy of the colour column
truc <- df_tragedy$color
# Points: mean distance per date; curve: loess fit over all rows of df_tragedy.
# (To print play names, add label = rownames(df_tragedy) to aes() and append
# geom_text(hjust = 0, vjust = 0, size = 3).)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(x = as.numeric(date), y = as.numeric(distance2genre))
) +
  geom_point(
    mapping = aes(colour = color),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the genre") +
  scale_colour_manual(values = c(blue = "blue", white = "transparent")) +
  theme_bw()
ggsave(
  filename = "R/images/distance2genre_tragedy_racine.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.2.3 Distance between the two centroids

# Plot the `distance` column of df_tragedy against the date: points are the
# per-date means coloured by author (legend hidden), with a loess trend curve.
# (Append geom_text(hjust = 0, vjust = 0, size = 3) to print the author label
# next to each point, or map label = rownames(df_tragedy) for row names.)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(
    x = as.numeric(date),
    y = as.numeric(distance),
    label = author
  )
) +
  geom_point(
    mapping = aes(colour = author),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance") +
  theme_bw()
ggsave(
  filename = "R/images/spread_tragedy.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

Controlling variance

# Rolling variance (window of 30 rows, at least 1 observation) of the
# `distance` column of df_tragedy, with a loess curve fitted on the row index.
# NOTE(review): rows of df_tragedy follow the order of rownames(metadata), so
# the window runs over that order, not over the date -- confirm this is the
# intended ordering.
# seq_len() stays empty for an empty data frame, where 1:nrow() gives c(1, 0).
x <- seq_len(nrow(df_tragedy))
y <- roll_var(as.numeric(df_tragedy$distance), width = 30, min_obs = 1)
lo <- loess(y ~ x)
# Predict on the full index so the curve stays aligned with x even when loess
# dropped rows whose rolling variance is NA (predict(lo) alone returns one
# value per fitted row and would be drawn shifted).
fit <- predict(lo, newdata = data.frame(x = x))
plot(x, y, xlab = "index", ylab = "variance")
lines(x, fit, col = 'red', lwd = 2)

# Same figure written to disk. The file name is specific to tragedies: the
# comedy and experimental sections previously wrote to the same
# "variance2.png", so the figures overwrote one another.
png("./R/images/variance_tragedy.png", width = 850, height = 600)
plot(x, y, xlab = "index", ylab = "variance")
lines(x, fit, col = 'red', lwd = 2)
dev.off()
## quartz_off_screen 
##                 2

5.3 Tragedy (EXPERIMENT MIXING STOPWORDS AND 3GRAMS)

We select all the authors with at least 3 plays (more than 2) in the dataset

# Number of plays per author (columns Var1 = author, Freq = count)
n_occur <- data.frame(table(metadata$Author))
# Drop the first row of the count table.
# NOTE(review): this assumes the first level is the empty author value --
# confirm, otherwise a real author is dropped here.
n_occur <- n_occur[-1, ]

# Keep the authors that occur more than twice (i.e. at least 3 plays),
# directly as a vector of author names
multiples <- n_occur$Var1[n_occur$Freq > 2]
# Number of authors retained
length(multiples)
## [1] 113

We select all the authors who have at least 3 tragedies (more than 2)

# Keep, among `multiples`, the authors with more than 2 rows in metadata whose
# Genre is 'tragedy'.
candidates <- as.character(multiples)
tragedy_counts <- vapply(
  candidates,
  function(candidate) {
    nrow(metadata[metadata$Genre == 'tragedy' & metadata$Author == candidate, ])
  },
  integer(1)
)
authorsSelected <- candidates[tragedy_counts > 2]

We compute the distance to the centroid of the author (with n-grams), the centroid of the genre (with stopwords) and the distance between both.

# Build the experimental df_tragedy: one row per play whose author is in
# authorsSelected, whose genre is 'tragedy' and whose date lies strictly
# between 1500 and 1800. The author centroid is computed on `d`, the genre
# centroid on `d_stop`. This assignment replaces any df_tragedy built earlier.

#create an empty df to store results (8 columns: one per field stored below)
df_tragedy <- data.frame(matrix(ncol = 8, nrow = 0))

#get name of plays
plays <- rownames(metadata)
#row index into metadata, kept in step with the play names
incr <- 0

#loop over plays
for (x in plays) {
  #increment
  incr <- incr + 1
  #get author name
  author <- metadata[incr, 2]
  #get genre
  genre <- metadata[incr, 4]
  #get date
  date <- metadata[incr, 3]
  #keep only tragedies by a selected author dated strictly between 1500 and 1800
  #(scalar condition, hence && rather than the elementwise &)
  if (author %in% authorsSelected && genre == 'tragedy' && date > 1500 && date < 1800) {
    #columns of d for all the plays of this author in this genre
    authorData <- d[, metadata[, "Author"] == author & metadata[, "Genre"] == genre]
    #columns of d_stop for the plays of the same genre dated less than 30 years
    #before or after the current play
    genreData <- d_stop[, metadata[, "Genre"] == genre & metadata[, "Date"] < date + 30 & metadata[, "Date"] > date - 30]
    #compute distance to centroid of the author
    authorToCentroid <- as.data.frame(DistToCentroid(authorData, method = "manhattan"))
    #compute distance to centroid of the genre
    genreToCentroid <- as.data.frame(DistToCentroid(genreData, method = "manhattan"))
    #mean distance to the genre centroid over the selected plays
    meanGenre <- mean(as.numeric(genreToCentroid$DistToCentroid))
    #get the distance of the play to the author (row looked up by play name)
    playDistAuthor <- authorToCentroid[x, ]
    #get the distance of the play to the genre (row looked up by play name)
    playDistGenre <- genreToCentroid[x, ]
    #combine both distances with euclidean() (defined outside this chunk), rounded
    distance <- round(euclidean(playDistGenre, playDistAuthor), digits = 3)
    #save the result only if the comparison below is not NA, which excludes a
    #missing distance to the author
    control <- playDistAuthor != ''
    if (!is.na(control)) {
      results <- c(x, author, genre, date, distance, playDistAuthor, playDistGenre, meanGenre)
      df_tragedy <- rbind(df_tragedy, results)
    }
  }
}
#give a name to the columns
x <- c("play", "author", "genre", "date", "distance", "distance2author", "distance2genre", "meanGenre")
labelPoints <- rownames(df_tragedy)
colnames(df_tragedy) <- x

5.3.1 Distance to author

# Distance to the author centroid over time, experimental tragedy data.
# First figure: points coloured by author (legend shown) and every point
# labelled with its author value; a loess curve summarises the trend.
# (Use label = rownames(df_tragedy) instead to label points with row names.)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(
    x = as.numeric(date),
    y = as.numeric(distance2author),
    label = author
  )
) +
  geom_point(mapping = aes(colour = author), stat = "summary", fun = "mean") +
  geom_smooth(method = loess, size = 1) +
  geom_text(hjust = 0, vjust = 0, size = 3) +
  labs(x = "Date", y = "Distance to the author") +
  theme_bw()
# Saved with the "test_" prefix used by the other figures of this experiment;
# the unprefixed name is the one written by section 5.2, which this chunk
# used to overwrite.
ggsave(
  filename = "R/images/test_distance2author_tragedy_labels.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

# Second figure: same data without text labels and without the colour legend.
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(
    x = as.numeric(date),
    y = as.numeric(distance2author),
    label = author
  )
) +
  geom_point(
    mapping = aes(colour = author),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the author") +
  theme_bw()
ggsave(
  filename = "R/images/test_distance2author_tragedy.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.3.1.1 Voltaire

# Highlight one author in the experimental "distance to author" plot: rows
# whose numeric author code equals 148 are drawn in blue, all others are
# transparent.
# NOTE(review): 148 is presumably the code standing for Voltaire (per the
# section heading) -- confirm against the coding of df_tragedy$author.
df_tragedy$color <- ifelse(as.numeric(df_tragedy$author) == 148, "blue", "white")
# copy of the colour column
truc <- df_tragedy$color
# Points: mean distance per date; curve: loess fit over all rows of df_tragedy.
# (To print play names, add label = rownames(df_tragedy) to aes() and append
# geom_text(hjust = 0, vjust = 0, size = 3).)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(x = as.numeric(date), y = as.numeric(distance2author))
) +
  geom_point(
    mapping = aes(colour = color),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the author") +
  scale_colour_manual(values = c(blue = "blue", white = "transparent")) +
  theme_bw()
ggsave(
  filename = "R/images/test_distance2author_tragedy_voltaire.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.3.1.2 Crébillon

# Highlight one author in the experimental "distance to author" plot: rows
# whose numeric author code equals 131 are drawn in blue, all others are
# transparent.
# NOTE(review): 131 is presumably the code standing for Crébillon (per the
# section heading) -- confirm against the coding of df_tragedy$author.
df_tragedy$color <- ifelse(as.numeric(df_tragedy$author) == 131, "blue", "white")
# copy of the colour column
truc <- df_tragedy$color
# Points: mean distance per date; curve: loess fit over all rows of df_tragedy.
# (To print play names, add label = rownames(df_tragedy) to aes() and append
# geom_text(hjust = 0, vjust = 0, size = 3).)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(x = as.numeric(date), y = as.numeric(distance2author))
) +
  geom_point(
    mapping = aes(colour = color),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the author") +
  scale_colour_manual(values = c(blue = "blue", white = "transparent")) +
  theme_bw()
ggsave(
  filename = "R/images/test_distance2author_tragedy_crebillon.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.3.1.3 Racine

# Highlight one author in the experimental "distance to author" plot: rows
# whose numeric author code equals 185 are drawn in blue, all others are
# transparent.
# NOTE(review): 185 is presumably the code standing for Racine (per the section
# heading) -- confirm against the coding of df_tragedy$author.
df_tragedy$color <- ifelse(as.numeric(df_tragedy$author) == 185, "blue", "white")
# copy of the colour column
truc <- df_tragedy$color
# Points: mean distance per date; curve: loess fit over all rows of df_tragedy.
# (To print play names, add label = rownames(df_tragedy) to aes() and append
# geom_text(hjust = 0, vjust = 0, size = 3).)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(x = as.numeric(date), y = as.numeric(distance2author))
) +
  geom_point(
    mapping = aes(colour = color),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the author") +
  scale_colour_manual(values = c(blue = "blue", white = "transparent")) +
  theme_bw()
ggsave(
  filename = "R/images/test_distance2author_tragedy_racine.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.3.2 Distance to genre

# Distance to the genre centroid over time, experimental tragedy data.
# First figure: points coloured by author (legend shown) and every point
# labelled with its author value; a loess curve summarises the trend.
# (Use label = rownames(df_tragedy) instead to label points with row names.)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(
    x = as.numeric(date),
    y = as.numeric(distance2genre),
    label = author
  )
) +
  geom_point(mapping = aes(colour = author), stat = "summary", fun = "mean") +
  geom_smooth(method = loess, size = 1) +
  geom_text(hjust = 0, vjust = 0, size = 3) +
  labs(x = "Date", y = "Distance to the genre") +
  theme_bw()
# Saved with the "test_" prefix used by the other figures of this experiment;
# the unprefixed name is the one written by section 5.2, which this chunk
# used to overwrite.
ggsave(
  filename = "R/images/test_distance2genre_tragedy_labels.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

# Second figure: same data without text labels and without the colour legend.
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(
    x = as.numeric(date),
    y = as.numeric(distance2genre),
    label = author
  )
) +
  geom_point(
    mapping = aes(colour = author),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the genre") +
  theme_bw()
ggsave(
  filename = "R/images/test_distance2genre_tragedy.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.3.2.1 Voltaire

# Highlight one author in the experimental "distance to genre" plot: rows
# whose numeric author code equals 148 are drawn in blue, all others are
# transparent.
# NOTE(review): 148 is presumably the code standing for Voltaire (per the
# section heading) -- confirm against the coding of df_tragedy$author.
df_tragedy$color <- ifelse(as.numeric(df_tragedy$author) == 148, "blue", "white")
# copy of the colour column
truc <- df_tragedy$color
# Points: mean distance per date; curve: loess fit over all rows of df_tragedy.
# (To print play names, add label = rownames(df_tragedy) to aes() and append
# geom_text(hjust = 0, vjust = 0, size = 3).)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(x = as.numeric(date), y = as.numeric(distance2genre))
) +
  geom_point(
    mapping = aes(colour = color),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the genre") +
  scale_colour_manual(values = c(blue = "blue", white = "transparent")) +
  theme_bw()
ggsave(
  filename = "R/images/test_distance2genre_tragedy_voltaire.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.3.2.2 Crébillon

# Highlight one author in the experimental "distance to genre" plot: rows
# whose numeric author code equals 131 are drawn in blue, all others are
# transparent.
# NOTE(review): 131 is presumably the code standing for Crébillon (per the
# section heading) -- confirm against the coding of df_tragedy$author.
df_tragedy$color <- ifelse(as.numeric(df_tragedy$author) == 131, "blue", "white")
# copy of the colour column
truc <- df_tragedy$color
# Points: mean distance per date; curve: loess fit over all rows of df_tragedy.
# (To print play names, add label = rownames(df_tragedy) to aes() and append
# geom_text(hjust = 0, vjust = 0, size = 3).)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(x = as.numeric(date), y = as.numeric(distance2genre))
) +
  geom_point(
    mapping = aes(colour = color),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the genre") +
  scale_colour_manual(values = c(blue = "blue", white = "transparent")) +
  theme_bw()
ggsave(
  filename = "R/images/test_distance2genre_tragedy_crebillon.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.3.2.3 Racine

# Highlight one author in the experimental "distance to genre" plot: rows
# whose numeric author code equals 185 are drawn in blue, all others are
# transparent.
# NOTE(review): 185 is presumably the code standing for Racine (per the section
# heading) -- confirm against the coding of df_tragedy$author.
df_tragedy$color <- ifelse(as.numeric(df_tragedy$author) == 185, "blue", "white")
# copy of the colour column
truc <- df_tragedy$color
# Points: mean distance per date; curve: loess fit over all rows of df_tragedy.
# (To print play names, add label = rownames(df_tragedy) to aes() and append
# geom_text(hjust = 0, vjust = 0, size = 3).)
vizEvoCentroid <- ggplot(
  data = df_tragedy,
  mapping = aes(x = as.numeric(date), y = as.numeric(distance2genre))
) +
  geom_point(
    mapping = aes(colour = color),
    stat = "summary",
    fun = "mean",
    show.legend = FALSE
  ) +
  geom_smooth(method = loess, size = 1) +
  labs(x = "Date", y = "Distance to the genre") +
  scale_colour_manual(values = c(blue = "blue", white = "transparent")) +
  theme_bw()
ggsave(
  filename = "R/images/test_distance2genre_tragedy_racine.png",
  plot = vizEvoCentroid,
  width = 2500, height = 2000, units = "px", dpi = 300
)
vizEvoCentroid

5.3.3 Distance between the two centroids

# Plot the `distance` column of the experimental df_tragedy against the date:
# points are the per-date means coloured by author, with a loess trend curve.
# The colour legend is hidden, as in the other "spread" figures of this file
# (spread_comedy.png, spread_tragedy.png); it was left on here only.
# (Append geom_text(hjust = 0, vjust = 0, size = 3) to print the author label
# next to each point, or map label = rownames(df_tragedy) for row names.)
vizEvoCentroid <- ggplot(data = df_tragedy, mapping = aes(x = as.numeric(date), y = as.numeric(distance), label=author))+
  geom_point(stat = "summary", fun = "mean", aes(colour = author), show.legend = FALSE)+
  geom_smooth(method = loess, size = 1)+
  xlab("Date") + ylab("Distance")+
  theme_bw()
ggsave("R/images/test_spread_tragedy.png", plot=vizEvoCentroid, width = 2500, height = 2000, units = "px", dpi = 300)
vizEvoCentroid

Controlling variance

# Rolling variance (window of 30 rows, at least 1 observation) of the
# `distance` column of the experimental df_tragedy, with a loess curve fitted
# on the row index.
# NOTE(review): rows of df_tragedy follow the order of rownames(metadata), so
# the window runs over that order, not over the date -- confirm this is the
# intended ordering.
# seq_len() stays empty for an empty data frame, where 1:nrow() gives c(1, 0).
x <- seq_len(nrow(df_tragedy))
y <- roll_var(as.numeric(df_tragedy$distance), width = 30, min_obs = 1)
lo <- loess(y ~ x)
# Predict on the full index so the curve stays aligned with x even when loess
# dropped rows whose rolling variance is NA (predict(lo) alone returns one
# value per fitted row and would be drawn shifted).
fit <- predict(lo, newdata = data.frame(x = x))
plot(x, y, xlab = "index", ylab = "variance")
lines(x, fit, col = 'red', lwd = 2)

# Same figure written to disk, with the "test_" prefix used by the other
# figures of this experiment. The previous name "variance2.png" was shared
# with the comedy and tragedy sections, so the figures overwrote one another.
png("./R/images/test_variance_tragedy.png", width = 850, height = 600)
plot(x, y, xlab = "index", ylab = "variance")
lines(x, fit, col = 'red', lwd = 2)
dev.off()
## quartz_off_screen 
##                 2